datadir <- '~/UROP'
source(file.path(datadir, 'R/unsupervised.R'))
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
## Attaching SeuratObject
source(file.path(datadir, 'R/analysis.R'))
## Loading required package: ggplot2
## Loading required package: lattice

Introduction

We analyze our model’s performance on simulated spatial datasets of varying pixel resolutions This enables us to evaluate the model’s performance on predicting proportions in a more realistic setting and lets us demonstrate the model’s full mode. The simulated pucks were generated using the procedure outlined by STdeconvolve on MERFISH data from Moffit et al. 2018.

Ground truth data

truth <- readRDS(file.path(datadir, 'Objects/vignette_v1/MERFISH_truth_20um2.rds'))
my_table = truth$annotDf
my_table$Cell_class <- factor(my_table$Cell_class)
n_levels = length(levels(my_table$Cell_class))
my_pal = pals::kelly(n_levels+1)[2:(n_levels+1)]
pres = unique(as.integer(my_table$Cell_class))
pres = pres[order(pres)]
if(n_levels > 21)
  my_pal = pals::polychrome(n_levels)
plot <- ggplot2::ggplot(my_table, ggplot2::aes(x=Centroid_X, y=Centroid_Y)) + ggplot2::geom_point(ggplot2::aes(size = .15, shape=19,color=Cell_class)) +
  ggplot2::scale_color_manual(values = my_pal[pres])+ ggplot2::scale_shape_identity() + ggplot2::theme_bw() + ggplot2::scale_size_identity()
plot

Unsupervised learning on 20um2 resolution

puck <- readRDS(file.path(datadir, '/Objects/vignette_v1/MERFISH_puck_20um2.rds'))
cell_type_info <- initialize.clusters(puck, resolution = 0.15)
gene_list <- puck@counts@Dimnames[[1]]
myRCTD <- create.object(puck, cell_type_info, gene_list_reg = gene_list)
myRCTD <- run.algorithm(myRCTD)
saveRDS(myRCTD, file.path(datadir, 'Objects/vignette_v1/MERFISH_20um2_final.rds'))

Data preprocessing

For the purposes of the analysis, ground truth pixels with a minority UMI proportion of at least 0.2 were considered doublets.

RCTD_list <- readRDS(file.path(datadir, 'Objects/vignette_v1/MERFISH_20um2_final.rds'))
RCTD_pred <- RCTD_list[[length(RCTD_list)]]

pred_results <- RCTD_pred@results$results_df
truth_results <- as.data.frame(truth$gtSpotTopics[rownames(pred_results),])

doublet_certain <- apply(truth_results, MARGIN = 1, function(x) length(which(x >= 0.2)) == 2)
singlet <- apply(truth_results, MARGIN = 1, function(x) max(x) > 0.8)
first_type <- apply(truth_results, MARGIN = 1, function(x) names(which(x == sort(x, decreasing = TRUE)[1]))[1])
second_type <- apply(truth_results, MARGIN = 1, function(x) names(which(x == sort(x, decreasing = TRUE)[2]))[1])

truth_results$spot_class <- 'reject'
truth_results[doublet_certain,]$spot_class <- 'doublet_certain'
truth_results[singlet,]$spot_class <- 'singlet'
truth_results$first_type <- first_type
truth_results$second_type <- second_type
truth_results <- truth_results[,c('spot_class', 'first_type', 'second_type')]

Spatially plot cell types

plot_all_cell_types <- function(results_df, coords, cell_type_names, resultsdir, size=0.15) {
  barcodes = rownames(results_df[results_df$spot_class != "reject" & results_df$first_type %in% cell_type_names,])
  my_table = coords[barcodes,]
  my_table$class = results_df[barcodes,]$first_type
  n_levels = length(levels(my_table$class))
  my_pal = pals::kelly(n_levels+1)[2:(n_levels+1)]
  pres = unique(as.integer(my_table$class))
  pres = pres[order(pres)]
  if(n_levels > 21)
    my_pal = pals::polychrome(n_levels)
  if(n_levels > 36)
    stop("Plotting currently supports at most 36 cell types as colors")
  plot <- ggplot2::ggplot(my_table, ggplot2::aes(x=x, y=y)) + ggplot2::geom_point(ggplot2::aes(size = size, shape=19,color=class)) +
    ggplot2::scale_color_manual(values = my_pal[pres])+ ggplot2::scale_shape_identity() + ggplot2::theme_bw() + ggplot2::scale_size_identity()
  pdf(file.path(resultsdir,"all_cell_types.pdf"))
  invisible(print(plot))
  dev.off()
  return(plot)
}
plot_all_cell_types(RCTD_pred@results$results_df, RCTD_pred@originalSpatialRNA@coords, RCTD_pred@cell_type_info$renorm[[2]], '..', size=0.3)

Cell type assignments

We can begin by looking at singlet assignment, since at this resolution, most pixels will only contain one cell type.

singlet_table <- function(truth_results, pred_results) {
  truth_singlet <- truth_results[truth_results$spot_class == 'singlet',]
  pred_singlet <- pred_results[pred_results$spot_class == 'singlet',]
  common_barcode <- intersect(row.names(truth_singlet), row.names(pred_singlet))
  truth_singlet <- truth_singlet[common_barcode,]
  pred_singlet <- pred_singlet[common_barcode,]
  truth_types = unlist(list(truth_singlet[,'first_type']))
  pred_types = unlist(list(pred_singlet[,'first_type']))
  return(table(truth_types, pred_types))
}
mytable <- singlet_table(truth_results, pred_results)
mytable
##              pred_types
## truth_types     0   1   2   3   4   5   6   7   8
##   Astrocyte     0 268   0   0   0   0   2   0   1
##   Endothelial   2   1 187   0   0   0   0   0   1
##   Ependymal     0   0   0   0   0   0 123   0   0
##   Excitatory  344   1   0   0   0  54   1  74  52
##   Inhibitory   21   1   0   0   2   0   0   0 939
##   Microglia     3   2   2   3   0   0   1   1   1
##   OD Immature   0   1   0   0  96   0   1   0   0
##   OD Mature     0   0   0 267   0   0   0   0   0
##   Pericytes     0   1  22   0   0   0   0   0   0

Visualizing progress across iterations

We can visualize how the cell type assignments change by plotting the assignment of excitatory and inhibitory neurons as a function of their expression of excitatory and inhibitory neuronal marker genes.

First, we can determine the marker genes with the ground truth data.

get_marker_data <- function(cell_type_names, cell_type_means, gene_list) {
  marker_means = cell_type_means[gene_list,]
  marker_norm = marker_means / rowSums(marker_means)
  marker_data = as.data.frame(cell_type_names[max.col(marker_means)])
  marker_data$max_epr <- apply(cell_type_means[gene_list,],1,max)
  colnames(marker_data) = c("cell_type",'max_epr')
  rownames(marker_data) = gene_list
  marker_data$log_fc <- 0
  epsilon <- 1e-9
  for(cell_type in unique(marker_data$cell_type)) {
    cur_genes <- gene_list[marker_data$cell_type == cell_type]
    other_mean = rowMeans(cell_type_means[cur_genes,cell_type_names != cell_type])
    marker_data$log_fc[marker_data$cell_type == cell_type] <- log(epsilon + cell_type_means[cur_genes,cell_type]) - log(epsilon + other_mean)
  }
  return(marker_data)
}
puck <- readRDS(file.path(datadir, '/Objects/vignette_v1/MERFISH_puck_20um2.rds'))
truth <- readRDS(file.path(datadir, 'Objects/vignette_v1/MERFISH_truth_20um2.rds'))

cur_cell_types <- c('Astrocyte', 'Endothelial', 'Ependymal', 'Excitatory', 'Inhibitory', 'OD Immature', 'OD Mature')
cell_type_info_restr = list(t(truth$gtCtGenes))
cell_type_info_restr[[1]] = cell_type_info_restr[[1]][,cur_cell_types]
cell_type_info_restr[[2]] = cur_cell_types; cell_type_info_restr[[3]] = length(cur_cell_types)
de_genes <- get_de_genes(cell_type_info_restr, puck, fc_thresh = 1, expr_thresh = .0001, MIN_OBS = 3)
## get_de_genes: Astrocyte found DE genes: 5
## get_de_genes: Endothelial found DE genes: 13
## get_de_genes: Ependymal found DE genes: 10
## get_de_genes: Excitatory found DE genes: 25
## get_de_genes: Inhibitory found DE genes: 26
## get_de_genes: OD Immature found DE genes: 5
## get_de_genes: OD Mature found DE genes: 8
## get_de_genes: total DE genes: 79
marker_data_de = get_marker_data(cell_type_info_restr[[2]], cell_type_info_restr[[1]], de_genes)

Next, we can plot cell type assignments across iterations.

excitatory_genes <- rownames(marker_data_de)[marker_data_de$cell_type == "Excitatory"]
inhibitory_genes <- rownames(marker_data_de)[marker_data_de$cell_type == "Inhibitory"]
RCTD_list <- readRDS(file.path(datadir, 'Objects/vignette_v1/MERFISH_20um2_final.rds'))

generate_marker_plot <- function(i) {
  results_df <- RCTD_list[[i]]@results$results_df
  barcodes <- rownames(results_df[results_df$spot_class != 'reject' & results_df$first_type == '0', ])
  plot_df <- data.frame(colSums(puck@counts[excitatory_genes, barcodes]) / puck@nUMI[barcodes], colSums(puck@counts[inhibitory_genes, barcodes]) / puck@nUMI[barcodes])
  colnames(plot_df) = c('Excitatory','Inhibitory')
  plot_df$type = "Excitatory"
  plot_df_final <- plot_df[puck@nUMI[barcodes] >= 300,]
  barcodes <- rownames(results_df[results_df$spot_class != 'reject' & results_df$first_type == '8', ])
  plot_df <- data.frame(colSums(puck@counts[excitatory_genes, barcodes]) / puck@nUMI[barcodes], colSums(puck@counts[inhibitory_genes, barcodes]) / puck@nUMI[barcodes])
  colnames(plot_df) = c('Excitatory','Inhibitory')
  plot_df$type = "Inhibitory"
  plot_df_final <- rbind(plot_df_final, plot_df[puck@nUMI[barcodes] >= 300,])
  MULT <- 500
  plot_df_final$Excitatory <- MULT * plot_df_final$Excitatory
  plot_df_final$Inhibitory <- MULT * plot_df_final$Inhibitory
  my_pal = pals::coolwarm(20)
  p <- ggplot2::ggplot(plot_df_final) + 
    ggplot2::geom_point(ggplot2::aes(x=Excitatory,y=Inhibitory, color = type),alpha = 0.2,size=1) + 
    ggplot2::coord_fixed() + ggplot2::theme_classic()  +
    labs(color="Cluster Cell Type") + guides(color = guide_legend(override.aes = list(size = 3,alpha=1))) + 
    scale_color_manual(values=c(my_pal[1], my_pal[20]))+ theme(legend.position="top") + xlab('Excitatory Markers') + ylab('Inhibitory Markers') + scale_x_continuous(breaks = c(0,100,200), limits = c(0,300)) + scale_y_continuous(breaks = c(0,100,200), limits = c(0,300)) + ggtitle(sprintf('Iteration %s', i))
  p
}

plots = list()
for (i in 1:length(RCTD_list)) {
  plots[[i]] <- generate_marker_plot(i)
}

ggarrange(plots[[1]], plots[[3]], plots[[5]], plots[[7]], plots[[9]], plots[[11]], plots[[13]], plots[[15]], plots[[17]], nrow = 3, ncol = 3, common.legend = T)

To quantify the change in cell type assignment across iterations more succinctly, we can plot the mean difference between inhibitory and excitatory gene expression as a function of iteration number for both cell types. The error regions represent one standard deviation.

generate_marker_data <- function(i) {
  results_df <- RCTD_list[[i]]@results$results_df
  barcodes <- rownames(results_df[results_df$spot_class != 'reject' & results_df$first_type == '0', ])
  plot_df <- data.frame(colSums(puck@counts[excitatory_genes, barcodes]) / puck@nUMI[barcodes], colSums(puck@counts[inhibitory_genes, barcodes]) / puck@nUMI[barcodes])
  colnames(plot_df) = c('Excitatory','Inhibitory')
  plot_df$type = "Excitatory"
  plot_df_final <- plot_df[puck@nUMI[barcodes] >= 300,]
  barcodes <- rownames(results_df[results_df$spot_class != 'reject' & results_df$first_type == '8', ])
  plot_df <- data.frame(colSums(puck@counts[excitatory_genes, barcodes]) / puck@nUMI[barcodes], colSums(puck@counts[inhibitory_genes, barcodes]) / puck@nUMI[barcodes])
  colnames(plot_df) = c('Excitatory','Inhibitory')
  plot_df$type = "Inhibitory"
  plot_df_final <- rbind(plot_df_final, plot_df[puck@nUMI[barcodes] >= 300,])
  return(plot_df_final)
}

df <- data.frame(matrix(ncol = 3, nrow = length(RCTD_list)))
colnames(df) <- c('iteration', 'mean', 'sd')
df$iteration <- as.numeric(rownames(df))
for (i in 1:length(RCTD_list)) {
  data <- generate_marker_data(i)
  data$diff <- data$Inhibitory - data$Excitatory
  df[i, 'mean'] <- mean(data[data$type == 'Excitatory', ]$diff)
  df[i, 'sd'] <- sd(data[data$type == 'Excitatory', ]$diff)
}
df$type <- 'Excitatory'
df_final <- df

df <- data.frame(matrix(ncol = 3, nrow = length(RCTD_list)))
colnames(df) <- c('iteration', 'mean', 'sd')
df$iteration <- as.numeric(rownames(df))
for (i in 1:length(RCTD_list)) {
  data <- generate_marker_data(i)
  data$diff <- data$Inhibitory - data$Excitatory
  df[i, 'mean'] <- mean(data[data$type == 'Inhibitory', ]$diff)
  df[i, 'sd'] <- sd(data[data$type == 'Inhibitory', ]$diff)
}
df$type <- 'Inhibitory'

df_final <- rbind(df_final, df)
MULT <- 500
df_final$mean <- MULT * df_final$mean
df_final$sd <- MULT * df_final$sd

my_pal = pals::coolwarm(20)
ggplot(data=df_final, aes(x=iteration, y=mean, ymin=mean-sd, ymax=mean+sd, fill=type)) + 
  geom_line() + 
  geom_ribbon(alpha=0.5) + 
  labs(fill="Cell Type") +
  xlab('Iteration') +
  ylab('Mean Expression Difference (Inhibitory - Excitatory)') +
  theme_minimal() +
  scale_y_continuous(breaks = c(-200,-100,0,100,200), limits = c(-200,200)) +
  scale_fill_manual(values=c(my_pal[1], my_pal[20]))

Comparing marker gene expression and proportion

With the marker gene data, we can also plot the doublet mode generated proportion of excitatory and inhibitory on each pixel in marker gene space. We will only look at pixels where the top two candidate cell types are excitatory and inhibitory neurons.

results_df <- RCTD_list[[length(RCTD_list)]]@results$results_df
barcodes <- rownames(results_df[results_df$spot_class != 'reject' & results_df$first_type == '8' & results_df$second_type == '0', ])
plot_df <- data.frame(colSums(puck@counts[excitatory_genes, barcodes]) / puck@nUMI[barcodes], colSums(puck@counts[inhibitory_genes, barcodes]) / puck@nUMI[barcodes])
colnames(plot_df) = c('Excitatory','Inhibitory')
plot_df$proportion = RCTD_list[[i]]@results$weights_doublet[barcodes, ][, 'first_type']
plot_df_final <- plot_df[puck@nUMI[barcodes] >= 300,]
barcodes <- rownames(results_df[results_df$spot_class != 'reject' & results_df$first_type == '0' & results_df$second_type == '8', ])
plot_df <- data.frame(colSums(puck@counts[excitatory_genes, barcodes]) / puck@nUMI[barcodes], colSums(puck@counts[inhibitory_genes, barcodes]) / puck@nUMI[barcodes])
colnames(plot_df) = c('Excitatory','Inhibitory')
plot_df$proportion = RCTD_list[[i]]@results$weights_doublet[barcodes, ][, 'second_type']
plot_df_final <- rbind(plot_df_final, plot_df[puck@nUMI[barcodes] >= 300,])
MULT <- 500
plot_df_final$Excitatory <- MULT * plot_df_final$Excitatory
plot_df_final$Inhibitory <- MULT * plot_df_final$Inhibitory

ggplot2::ggplot(plot_df_final) + ggplot2::geom_point(ggplot2::aes(x=Excitatory,y=Inhibitory, color = proportion),alpha = 1,size=0.3) + ggplot2::coord_fixed() + ggplot2::scale_shape_identity() + ggplot2::theme_classic() + ggplot2::scale_size_identity() + labs(color="Inhibitory Proportion") + ggplot2::scale_colour_gradientn(colors = pals::brewer.rdbu(20)[2:19]) + xlab('Excitatory Markers') + ylab('Inhibitory Markers') + scale_x_continuous(breaks = c(0,100,200), limits = c(0,300)) + scale_y_continuous(breaks = c(0,100,200), limits = c(0,300))

Finally, treating excitatory and inhibitory neurons as subtypes of one cell type, we can run the unsupervised model’s full mode on the pixels represented by one of the two neuron cell types so that each pixel gets some proportion of each cell type.

We first run the cell subtype model, which is the same as running full mode.

RCTD_list <- readRDS(file.path(datadir, 'Objects/vignette_v1/MERFISH_20um2_final.rds'))
RCTD_pred <- RCTD_list[[length(RCTD_list)]]
gene_list <- RCTD_pred@internal_vars$gene_list_reg
neuron_RCTD <- initialize.subtypes(RCTD_pred, c('0', '5', '7', '8'), resolution = 0.02, gene_list = gene_list)
neuron_RCTD <- run.subtypes(neuron_RCTD)
saveRDS(neuron_RCTD, file.path(datadir, 'Objects/vignette_v1/MERFISH_neurons.rds'))

We can then look at the proportion of the two neuron types in each pixel, plotted in marker gene space.

RCTD_list <- readRDS(file.path(datadir, 'Objects/vignette_v1/MERFISH_neurons.rds'))

results_df <- RCTD_list[[length(RCTD_list)]]@results$weights
barcodes <- rownames(results_df)
plot_df <- data.frame(colSums(puck@counts[excitatory_genes, barcodes]) / puck@nUMI[barcodes], colSums(puck@counts[inhibitory_genes, barcodes]) / puck@nUMI[barcodes])
colnames(plot_df) = c('Excitatory','Inhibitory')
plot_df$proportion = results_df[, 'subtype_0'] / (results_df[, 'subtype_0'] + results_df[, 'subtype_1'])
plot_df_final <- plot_df[puck@nUMI[barcodes] >= 300,]
MULT <- 500
plot_df_final$Excitatory <- MULT * plot_df_final$Excitatory
plot_df_final$Inhibitory <- MULT * plot_df_final$Inhibitory

ggplot2::ggplot(plot_df_final) + ggplot2::geom_point(ggplot2::aes(x=Excitatory,y=Inhibitory, color = proportion),alpha = 1,size=0.3) + ggplot2::coord_fixed() + ggplot2::scale_shape_identity() + ggplot2::theme_classic() + ggplot2::scale_size_identity() + labs(color="Inhibitory Proportion") + ggplot2::scale_colour_gradientn(colors = pals::brewer.rdbu(20)) + xlab('Excitatory Markers') + ylab('Inhibitory Markers') + scale_x_continuous(breaks = c(0,100,200), limits = c(0,300)) + scale_y_continuous(breaks = c(0,100,200), limits = c(0,300))

Unsupervised learning on 50um2 resolution

puck <- readRDS(file.path(datadir, '/Objects/vignette_v1/MERFISH_puck_50um2.rds'))
gene_list <- puck@counts@Dimnames[[1]]
cell_type_info <- initialize.clusters(puck, resolution = 0.6)
myRCTD <- create.object(puck, cell_type_info, gene_list_reg = gene_list)
myRCTD <- run.algorithm(myRCTD, doublet_mode = 'full')
saveRDS(myRCTD, file.path(datadir, 'Objects/vignette_v1/MERFISH_50um2_final.rds'))

Data preprocessing

RCTD_list <- readRDS(file.path(datadir, 'Objects/vignette_v1/MERFISH_50um2_final.rds'))
RCTD_pred <- RCTD_list[[length(RCTD_list)]]
truth <- readRDS(file.path(datadir, 'Objects/vignette_v1/MERFISH_truth_50um2.rds'))

pred_results <- RCTD_pred@results$weights
#for (i in 1:dim(pred_results)[1]) {
#  pred_results[i,] = pred_results[i,] / rowSums(pred_results)[i]
#}
truth_results <- truth$gtSpotTopics[rownames(pred_results),]

Compare to ground truth

Compare gene expression

We make a heatmap of gene expression correlation between the ground truth and predicted gene expression profiles.

pred_gene <- RCTD_pred@cell_type_info$renorm[[1]]
truth_gene <- t(truth$gtCtGenes)
gene_correlation <- cor(pred_gene, truth_gene)
heatmap(gene_correlation)

Compare cell types

We can assign predicted cell types to ground truth cell types by generating a heatmap of proportion correlation. The assignments are boxed.

true_types <- c('Pericytes', 'Microglia', 'Ependymal', 'OD Immature', 'Endothelial', 'OD Mature', 'Astrocyte', 'Excitatory', 'Inhibitory')
pred_types <- c('7', '3', '1', '6', '5', '4', '8', '2', '0')
correlation_matrix <- cor(truth_results, data.matrix(pred_results))
data <- melt(correlation_matrix)
data$diag = FALSE
data[data$Var1 == 'Ependymal' & data$Var2 == '7', ]$diag = TRUE
data[data$Var1 == 'OD Immature' & data$Var2 == '3', ]$diag = TRUE
data[data$Var1 == 'Endothelial' & data$Var2 == '1', ]$diag = TRUE
data[data$Var1 == 'OD Mature' & data$Var2 == '6', ]$diag = TRUE
data[data$Var1 == 'Astrocyte' & data$Var2 == '5', ]$diag = TRUE
data[data$Var1 == 'Excitatory' & data$Var2 %in% c('4','8'), ]$diag = TRUE
data[data$Var1 == 'Inhibitory' & data$Var2 %in% c('2','0'), ]$diag = TRUE
data$diag[!data$diag] <- NA

ggplot(data, aes(factor(Var1, levels=true_types), factor(Var2, levels=pred_types), fill= value)) +
                           geom_tile() +
                           theme_classic() +
                           scale_fill_gradientn(colors = pals::brewer.rdbu(20)[2:19], limits=c(-1,1), name='Correlation') +
                           theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                           xlab('True Cell Type')+ ylab('Predicted Cell Type') + 
               geom_tile(data = data[!is.na(data$diag), ], aes(color = diag), size = 0.7) +
               scale_color_manual(guide = FALSE, values = c('TRUE' = "#D55E00"))

We can also plot the absolute error proportion for each pixel for each cell type. Plotting us lets us see model performance as a function of ground truth cell type.

ependymal_df <- as.data.frame(abs(pred_results[, '7'] - truth_results[, 'Ependymal']))
colnames(ependymal_df) <- 'value'
ependymal_df$type <- 'Ependymal'

od_immature_df <- as.data.frame(abs(pred_results[, '3'] - truth_results[, 'OD Immature']))
colnames(od_immature_df) <- 'value'
od_immature_df$type <- 'OD Immature'

endothelial_df <- as.data.frame(abs(pred_results[, '1'] - truth_results[, 'Endothelial']))
colnames(endothelial_df) <- 'value'
endothelial_df$type <- 'Endothelial'

od_mature_df <- as.data.frame(abs(pred_results[, '6'] - truth_results[, 'OD Mature']))
colnames(od_mature_df) <- 'value'
od_mature_df$type <- 'OD Mature'

astrocyte_df <- as.data.frame(abs(pred_results[, '5'] - truth_results[, 'Astrocyte']))
colnames(astrocyte_df) <- 'value'
astrocyte_df$type <- 'Astrocyte'

excitatory_df <- as.data.frame(abs(pred_results[, '4'] + pred_results[, '8'] - truth_results[, 'Excitatory']))
colnames(excitatory_df) <- 'value'
excitatory_df$type <- 'Excitatory'

inhibitory_df <- as.data.frame(abs(pred_results[, '2'] + pred_results[, '0'] - truth_results[, 'Inhibitory']))
colnames(inhibitory_df) <- 'value'
inhibitory_df$type <- 'Inhibitory'

abserr_df <- rbind(ependymal_df, od_immature_df, endothelial_df, od_mature_df, astrocyte_df, excitatory_df, inhibitory_df)
abserr_df$type <- as.factor(abserr_df$type)

ggplot(abserr_df, aes(x=type, y=value)) + geom_boxplot(width=0.1, outlier.shape = NA) + theme_minimal() + ylab('Absolute Error') + xlab('Cell Type') + ylim(0, 0.4)

Compare with STdeconvolve

We first run STdeconvolve on the dataset.

library(STdeconvolve)
puck <- readRDS(file.path(datadir, '/Objects/vignette_v1/MERFISH_puck_50um2.rds'))
cd <- puck@counts
ldas <- fitLDA(t(cd), Ks = 9)

optLDA <- optimalModel(models = ldas, opt = "min")
results <- getBetaTheta(optLDA, perc.filt = 0.05, betaScale = 1000)
deconProp <- results$theta
deconGexp <- results$beta

Then we can compare to the ground truth.

Gene expression

pred_gene <- t(deconGexp)
truth_gene <- t(truth$gtCtGenes)
gene_correlation <- cor(pred_gene, truth_gene)
heatmap(gene_correlation)

Cell proportions

true_types <- c('Pericytes', 'Microglia', 'Ependymal', 'OD Immature', 'Endothelial', 'OD Mature', 'Astrocyte', 'Excitatory', 'Inhibitory')
pred_types <- c('3', '9', '6', '5', '1', '8', '7', '2', '4')
decon_results <- deconProp[rownames(truth_results), ]
correlation_matrix <- cor(truth_results, decon_results)
data <- melt(correlation_matrix)
data$diag = FALSE
data[data$Var1 == 'Pericytes' & data$Var2 == '3', ]$diag = TRUE
data[data$Var1 == 'OD Immature' & data$Var2 == '9', ]$diag = TRUE
data[data$Var1 == 'Endothelial' & data$Var2 == '6', ]$diag = TRUE
data[data$Var1 == 'OD Mature' & data$Var2 == '5', ]$diag = TRUE
data[data$Var1 == 'Astrocyte' & data$Var2 == '1', ]$diag = TRUE
data[data$Var1 == 'Excitatory' & data$Var2 %in% c('7','8'), ]$diag = TRUE
data[data$Var1 == 'Inhibitory' & data$Var2 %in% c('2','4'), ]$diag = TRUE
data$diag[!data$diag] <- NA

ggplot(data, aes(factor(Var1, levels = true_types), factor(Var2, levels = pred_types), fill= value)) +
                           geom_tile() +
                           theme_classic() +
                           scale_fill_gradientn(colors = pals::brewer.rdbu(20)[2:19], limits=c(-1,1), name='Correlation') +
                           theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                           xlab('True Cell Type')+ ylab('Predicted Cell Type') + 
               geom_tile(data = data[!is.na(data$diag), ], aes(color = diag), size = 0.7) +
               scale_color_manual(guide = FALSE, values = c('TRUE' = "#D55E00"))

We can compare the pixel proportion RMSE between the two methods.

unsupervised_rmse <- as.data.frame(sqrt((
  (pred_results[, '7'] - truth_results[, 'Ependymal']) ^ 2 + 
  (pred_results[, '3'] - truth_results[, 'OD Immature']) ^ 2 + 
  (pred_results[, '1'] - truth_results[, 'Endothelial']) ^ 2 + 
  (pred_results[, '6'] - truth_results[, 'OD Mature']) ^ 2 + 
  (pred_results[, '5'] - truth_results[, 'Astrocyte']) ^ 2 + 
  (pred_results[, '4'] + pred_results[, '8'] - truth_results[, 'Excitatory']) ^ 2 +
  (pred_results[, '2'] + pred_results[, '0'] - truth_results[, 'Inhibitory']) ^ 2
) / 7))
colnames(unsupervised_rmse) <- 'value'
unsupervised_rmse$method <- 'Unsupervised'

decon_rmse <- as.data.frame(sqrt((
  (decon_results[, '3'] - truth_results[, 'Pericytes']) ^ 2 + 
  (decon_results[, '9'] - truth_results[, 'OD Immature']) ^ 2 + 
  (decon_results[, '6'] - truth_results[, 'Endothelial']) ^ 2 + 
  (decon_results[, '5'] - truth_results[, 'OD Mature']) ^ 2 + 
  (decon_results[, '1'] - truth_results[, 'Astrocyte']) ^ 2 + 
  (decon_results[, '7'] + decon_results[, '8'] - truth_results[, 'Excitatory']) ^ 2 +
  (decon_results[, '2'] + decon_results[, '4'] - truth_results[, 'Inhibitory']) ^ 2
) / 7))
colnames(decon_rmse) <- 'value'
decon_rmse$method <- 'STdeconvolve'

rmse <- rbind(unsupervised_rmse, decon_rmse)

ggplot(rmse, aes(x=method, y=value)) + 
    geom_violin(trim=T, alpha=0.1) + geom_boxplot(width=0.1) + theme_minimal() + ylab('RMSE') + xlab('Method')

We can also plot the comparison factored by cell type.

abserr_df <- abserr_df[abserr_df$type != 'Ependymal', ]
abserr_df$type <- as.factor(abserr_df$type)
abserr_df$method <- 'Unsupervised'

od_immature_df <- as.data.frame(abs(decon_results[, '9'] - truth_results[, 'OD Immature']))
colnames(od_immature_df) <- 'value'
od_immature_df$type <- 'OD Immature'

endothelial_df <- as.data.frame(abs(decon_results[, '6'] - truth_results[, 'Endothelial']))
colnames(endothelial_df) <- 'value'
endothelial_df$type <- 'Endothelial'

od_mature_df <- as.data.frame(abs(decon_results[, '5'] - truth_results[, 'OD Mature']))
colnames(od_mature_df) <- 'value'
od_mature_df$type <- 'OD Mature'

astrocyte_df <- as.data.frame(abs(decon_results[, '1'] - truth_results[, 'Astrocyte']))
colnames(astrocyte_df) <- 'value'
astrocyte_df$type <- 'Astrocyte'

excitatory_df <- as.data.frame(abs(decon_results[, '7'] + decon_results[, '8'] - truth_results[, 'Excitatory']))
colnames(excitatory_df) <- 'value'
excitatory_df$type <- 'Excitatory'

inhibitory_df <- as.data.frame(abs(decon_results[, '2'] + decon_results[, '4'] - truth_results[, 'Inhibitory']))
colnames(inhibitory_df) <- 'value'
inhibitory_df$type <- 'Inhibitory'

decon_df <- rbind(od_immature_df, endothelial_df, od_mature_df, astrocyte_df, excitatory_df, inhibitory_df)
decon_df$type <- as.factor(decon_df$type)
decon_df$method <- 'STdeconvolve'

mae <- rbind(abserr_df, decon_df)

ggplot(mae, aes(x=type, y=value, fill=factor(method))) + geom_boxplot(width=0.25, outlier.shape = NA) + theme_minimal() + ylab('Absolute Error') + xlab('Cell Type') + ylim(0, 0.4)

Spatially plot cell type proportions

Mature Oligodendrocytes

plot_puck_continuous(
  RCTD_pred@originalSpatialRNA,
  colnames(RCTD_pred@originalSpatialRNA@counts),
  RCTD_pred@results$weights[,'6'],
  size=3,
  my_pal = pals::brewer.blues(20)[2:20]
)

Ependymal

plot_puck_continuous(
  RCTD_pred@originalSpatialRNA,
  colnames(RCTD_pred@originalSpatialRNA@counts),
  RCTD_pred@results$weights[,'7'],
  size=3,
  my_pal = pals::brewer.blues(20)[2:20]
)

Cell type proportions

Mature Oligodendrocytes

data <- as.data.frame(cbind(pred_results[, '6'], truth_results[, 'OD Mature']))
colnames(data) <- c('pred','truth')

my_pal = pals::coolwarm(20)
p1 <- ggplot(data, aes(x=truth, y=pred, colour="blue")) +
  geom_point(color=my_pal[1]) +
  theme_classic() +
  xlab('True Mature OD Proportion') +
  ylab('Predicted Mature OD Proportion') + 
  scale_y_continuous(breaks = c(0,0.5,1), limits = c(-.03,1.03)) + 
  scale_x_continuous(breaks = c(0,0.5,1), limits = c(-.03,1.03)) +
  theme(legend.position = "none") +
  geom_abline(slope=1, color = my_pal[20])
p1

# proportion RMSE
sqrt(sum((pred_results[, '6'] - truth_results[, 'OD Mature']) ** 2) / length(pred_results[,'6']))
## [1] 0.05866743

Ependymal

data <- as.data.frame(cbind(pred_results[, '7'], truth_results[, 'Ependymal']))
colnames(data) <- c('pred','truth')

my_pal = pals::coolwarm(20)
p2 <- ggplot(data, aes(x=truth, y=pred, colour="blue")) +
  geom_point(color=my_pal[1]) +
  theme_classic() +
  xlab('True Ependymal Proportion') +
  ylab('Predicted Ependymal Proportion') + 
  scale_y_continuous(breaks = c(0,0.5,1), limits = c(-.03,1.03)) + 
  scale_x_continuous(breaks = c(0,0.5,1), limits = c(-.03,1.03)) +
  theme(legend.position = "none") +
  geom_abline(slope=1, color = my_pal[20])
p2

# proportion RMSE
sqrt(sum((pred_results[, '7'] - truth_results[, 'Ependymal']) ** 2) / length(pred_results[,'7']))
## [1] 0.05266362